1 Setup and Data Loading

1.1 Load Packages

library(tidyverse)
library(janitor)
library(knitr)
library(here)

# Optional packages
has_naniar <- suppressWarnings(require(naniar, quietly = TRUE))
has_skimr <- suppressWarnings(require(skimr, quietly = TRUE))
has_gt <- suppressWarnings(require(gt, quietly = TRUE))

1.2 Load Data

# File paths using here() for robust path resolution
primary_file <- here("data", "grf_shooting_clean.csv")
fallback_file <- here("data", "GRF Shooting SM[40].csv")

# Load data with fallback
if (file.exists(primary_file)) {
  df <- read_csv(primary_file, show_col_types = FALSE)
  cat("Loaded primary file:", primary_file, "\n")
} else if (file.exists(fallback_file)) {
  df <- read_csv(fallback_file, show_col_types = FALSE) %>%
    janitor::clean_names()
  cat("Loaded fallback file:", fallback_file, "\n")
  cat("Applied janitor::clean_names()\n")
} else {
  stop("ERROR: Neither data file found!\n",
       "Expected: ", primary_file, " or ", fallback_file)
}
## Loaded primary file: /Users/samuelmontalvo/Documents/GitHub/basketball_shooting_project/data/grf_shooting_clean.csv
cat("\nDimensions:", nrow(df), "rows x", ncol(df), "columns\n")
## 
## Dimensions: 460 rows x 18 columns

1.3 Column Names

cat("Column names:\n")
## Column names:
cat(paste(names(df), collapse = "\n"))
## player_id
## shot_number
## shot_type
## made
## position
## height
## body_mass
## peak_braking_force
## peak_braking_power
## peak_propulsive_force
## peak_propulsive_power
## left_leg_peak_propulsive_force
## right_leg_peak_propulsive_force
## jump_height
## braking_duration
## propulsive_duration
## displacement_depth
## force_at_min_displacement

1.4 Identify Key Columns

# Helper function to find columns by keyword
find_col <- function(cols, patterns) {
for (p in patterns) {
    matches <- cols[str_detect(tolower(cols), tolower(p))]
    if (length(matches) > 0) return(matches[1])
  }
  return(NA_character_)
}

# Find player_id
player_id_col <- find_col(names(df), c("^player_id$", "^player$", "subject", "athlete", "^pid$", "^id$"))

if (is.na(player_id_col)) {
  stop("ERROR: Could not identify player_id column.\n",
       "Expected column containing: player, pid, athlete, subject, or id")
}

# Rename to player_id if needed
if (player_id_col != "player_id") {
  df <- df %>% rename(player_id = !!sym(player_id_col))
}
df <- df %>% mutate(player_id = as.character(player_id))

# Find shot_type
shot_type_col <- find_col(names(df), c("^shot_type$", "shot.*type", "type.*2.*3"))
has_shot_type <- !is.na(shot_type_col)

if (has_shot_type && shot_type_col != "shot_type") {
  df <- df %>% rename(shot_type = !!sym(shot_type_col))
}
if (has_shot_type) {
  df <- df %>%
    mutate(shot_type = case_when(
      shot_type %in% c(2, "2", "2PT", "2pt") ~ "2PT",
      shot_type %in% c(3, "3", "3PT", "3pt") ~ "3PT",
      TRUE ~ as.character(shot_type)
    ))
}

# Find made/result
made_col <- find_col(names(df), c("^made$", "miss.*made", "result", "outcome", "make"))
has_made <- !is.na(made_col)

if (has_made && made_col != "made") {
  df <- df %>% rename(made = !!sym(made_col))
}
if (has_made) {
  df <- df %>% mutate(made = as.integer(made))
}

# Report findings
cat("Column Mapping:\n")
## Column Mapping:
cat("─────────────────────────────────────\n")
## ─────────────────────────────────────
cat("player_id:", player_id_col, "→ player_id\n")
## player_id: player_id → player_id
cat("shot_type:", ifelse(has_shot_type, shot_type_col, "NOT FOUND"), "\n")
## shot_type: shot_type
cat("made:     ", ifelse(has_made, made_col, "NOT FOUND"), "\n")
## made:      made

2 Data Integrity Checks

2.1 Missingness by Column

missingness_by_col <- df %>%
  summarise(across(everything(), ~ sum(is.na(.x)))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "n_missing") %>%
  mutate(
    n_total = nrow(df),
    pct_missing = round(100 * n_missing / n_total, 2)
  ) %>%
  arrange(desc(pct_missing))

kable(missingness_by_col, caption = "Missingness by Column")
Missingness by Column
column n_missing n_total pct_missing
player_id 0 460 0
shot_number 0 460 0
shot_type 0 460 0
made 0 460 0
position 0 460 0
height 0 460 0
body_mass 0 460 0
peak_braking_force 0 460 0
peak_braking_power 0 460 0
peak_propulsive_force 0 460 0
peak_propulsive_power 0 460 0
left_leg_peak_propulsive_force 0 460 0
right_leg_peak_propulsive_force 0 460 0
jump_height 0 460 0
braking_duration 0 460 0
propulsive_duration 0 460 0
displacement_depth 0 460 0
force_at_min_displacement 0 460 0
p_missing <- missingness_by_col %>%
  filter(pct_missing > 0 | row_number() <= 10) %>%
  ggplot(aes(x = reorder(column, pct_missing), y = pct_missing)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Missing Data by Column",
    x = NULL,
    y = "% Missing"
  ) +
  theme_minimal()

print(p_missing)

2.2 Duplicate Rows

n_duplicates <- sum(duplicated(df))
cat("Number of fully duplicated rows:", n_duplicates, "\n")
## Number of fully duplicated rows: 0
if ("shot_number" %in% names(df) && has_shot_type) {
  dup_trials <- df %>%
    group_by(player_id, shot_type, shot_number) %>%
    filter(n() > 1) %>%
    nrow()
  cat("Duplicate (player_id, shot_type, shot_number) combinations:", dup_trials, "\n")
}
## Duplicate (player_id, shot_type, shot_number) combinations: 0

2.3 Trial Balance

trials_per_player <- df %>%
  count(player_id, name = "n_trials") %>%
  arrange(player_id)

kable(trials_per_player, caption = "Trials per Player")
Trials per Player
player_id n_trials
1 20
10 20
11 20
12 20
13 20
14 20
15 20
16 20
17 20
18 20
19 20
2 20
20 20
21 20
22 20
23 20
3 20
4 20
5 20
6 20
7 20
8 20
9 20
trials_by_type <- df %>%
  count(player_id, shot_type) %>%
  pivot_wider(names_from = shot_type, values_from = n, values_fill = 0) %>%
  mutate(
    total = `2PT` + `3PT`,
    balanced = ifelse(`2PT` == 10 & `3PT` == 10, "✓", "✗")
  )

kable(trials_by_type, caption = "Trials by Shot Type (Expected: 10/10)")
Trials by Shot Type (Expected: 10/10)
player_id 2PT 3PT total balanced
1 10 10 20
10 10 10 20
11 10 10 20
12 10 10 20
13 10 10 20
14 10 10 20
15 10 10 20
16 10 10 20
17 10 10 20
18 10 10 20
19 10 10 20
2 10 10 20
20 10 10 20
21 10 10 20
22 10 10 20
23 10 10 20
3 10 10 20
4 10 10 20
5 10 10 20
6 10 10 20
7 10 10 20
8 10 10 20
9 10 10 20
complete_players <- sum(trials_by_type$balanced == "✓")
cat("\nPlayers with complete data (10 2PT + 10 3PT):", complete_players, "of", nrow(trials_by_type), "\n")
## 
## Players with complete data (10 2PT + 10 3PT): 23 of 23

3 Numeric Variable Overview

3.1 Summary Statistics

# Identify numeric columns
exclude_cols <- c("player_id", "shot_type", "made", "position")
numeric_cols <- df %>%
  select(-any_of(exclude_cols)) %>%
  select(where(is.numeric)) %>%
  names()

cat("Identified", length(numeric_cols), "numeric columns\n")
## Identified 14 numeric columns
# Create summary table
numeric_summary <- df %>%
  select(all_of(numeric_cols)) %>%
  summarise(across(everything(), list(
    mean = ~ mean(.x, na.rm = TRUE),
    sd = ~ sd(.x, na.rm = TRUE),
    median = ~ median(.x, na.rm = TRUE),
    iqr = ~ IQR(.x, na.rm = TRUE),
    min = ~ min(.x, na.rm = TRUE),
    max = ~ max(.x, na.rm = TRUE),
    p1 = ~ quantile(.x, 0.01, na.rm = TRUE),
    p99 = ~ quantile(.x, 0.99, na.rm = TRUE),
    pct_na = ~ round(100 * sum(is.na(.x)) / length(.x), 2)
  ))) %>%
  pivot_longer(
    everything(),
    names_to = c("variable", "stat"),
    names_pattern = "^(.*)_(mean|sd|median|iqr|min|max|p1|p99|pct_na)$"
  ) %>%
  pivot_wider(names_from = stat, values_from = value) %>%
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
  arrange(variable)

kable(numeric_summary, caption = "Numeric Variable Summary Statistics")
Numeric Variable Summary Statistics
variable mean sd median iqr min max p1 p99 pct_na
body_mass 91.29 12.77 90.68 12.07 68.88 123.97 68.88 123.97 0
braking_duration 0.42 0.47 0.29 0.12 0.08 2.81 0.11 2.69 0
displacement_depth 21.95 19.42 17.70 8.05 0.00 184.80 0.79 100.73 0
force_at_min_displacement 1500.07 381.29 1518.00 396.50 7.90 2643.00 11.20 2333.87 0
height 195.35 6.49 195.00 10.00 180.00 206.00 180.00 206.00 0
jump_height 9.69 5.05 8.90 7.40 1.30 27.10 1.60 23.08 0
left_leg_peak_propulsive_force 1048.32 201.98 1018.00 302.00 609.00 1567.00 653.95 1513.97 0
peak_braking_force 1532.33 308.83 1520.50 378.75 827.00 2643.00 979.16 2333.87 0
peak_braking_power 601.14 292.79 594.50 379.25 -0.14 2131.00 59.36 1540.15 0
peak_propulsive_force 1959.21 343.43 1911.50 529.75 1099.00 2914.00 1194.96 2827.27 0
peak_propulsive_power 2753.89 734.59 2711.00 975.25 544.00 7311.00 1326.22 4580.10 0
propulsive_duration 0.31 0.25 0.29 0.08 0.14 2.71 0.18 1.47 0
right_leg_peak_propulsive_force 940.41 167.93 926.50 207.25 568.00 1513.00 603.85 1421.17 0
shot_number 5.50 2.88 5.50 5.00 1.00 10.00 1.00 10.00 0

4 Outlier Screening

4.1 Global Outliers (IQR Rule)

global_outliers <- tibble(
  column = character(),
  n_outliers = integer(),
  pct_outliers = numeric()
)

for (col in numeric_cols) {
  x <- df[[col]]
  q1 <- quantile(x, 0.25, na.rm = TRUE)
  q3 <- quantile(x, 0.75, na.rm = TRUE)
  iqr <- q3 - q1
  lower <- q1 - 1.5 * iqr
  upper <- q3 + 1.5 * iqr

  n_out <- sum(x < lower | x > upper, na.rm = TRUE)
  pct_out <- round(100 * n_out / sum(!is.na(x)), 2)

  global_outliers <- global_outliers %>%
    add_row(column = col, n_outliers = n_out, pct_outliers = pct_out)
}

4.2 Within-Player Outliers (|z| > 3)

within_player_outliers <- tibble(
  column = character(),
  n_outliers = integer()
)

# Players with >= 5 trials
valid_players <- df %>%
  count(player_id) %>%
  filter(n >= 5) %>%
  pull(player_id)

df_valid <- df %>% filter(player_id %in% valid_players)

for (col in numeric_cols) {
  z_scores <- df_valid %>%
    group_by(player_id) %>%
    mutate(
      col_mean = mean(.data[[col]], na.rm = TRUE),
      col_sd = sd(.data[[col]], na.rm = TRUE),
      z = ifelse(col_sd > 0, (.data[[col]] - col_mean) / col_sd, 0)
    ) %>%
    ungroup() %>%
    pull(z)

  n_out <- sum(abs(z_scores) > 3, na.rm = TRUE)

  within_player_outliers <- within_player_outliers %>%
    add_row(column = col, n_outliers = n_out)
}

4.3 Combined Outlier Summary

outlier_summary <- global_outliers %>%
  rename(global_outliers = n_outliers, global_pct = pct_outliers) %>%
  left_join(
    within_player_outliers %>% rename(within_player_outliers = n_outliers),
    by = "column"
  ) %>%
  arrange(desc(global_outliers + within_player_outliers))

kable(outlier_summary, caption = "Outlier Summary by Column")
Outlier Summary by Column
column global_outliers global_pct within_player_outliers
displacement_depth 59 12.83 8
braking_duration 54 11.74 9
body_mass 40 8.70 0
force_at_min_displacement 18 3.91 3
propulsive_duration 11 2.39 6
peak_braking_force 10 2.17 4
peak_braking_power 7 1.52 4
right_leg_peak_propulsive_force 10 2.17 0
peak_propulsive_power 4 0.87 3
jump_height 3 0.65 0
shot_number 0 0.00 0
height 0 0.00 0
peak_propulsive_force 0 0.00 0
left_leg_peak_propulsive_force 0 0.00 0

5 Distributions

5.1 Select Key Variables

key_keywords <- c("force", "power", "jump", "brak", "propul", "duration", "depth")

# Get keyword matches
keyword_matches <- numeric_cols[str_detect(tolower(numeric_cols),
                                           paste(key_keywords, collapse = "|"))]

# Get top 10 by variance
var_ranking <- df %>%
  select(all_of(numeric_cols)) %>%
  summarise(across(everything(), ~ var(.x, na.rm = TRUE))) %>%
  pivot_longer(everything(), names_to = "column", values_to = "variance") %>%
  arrange(desc(variance))

top_by_variance <- var_ranking %>% slice_head(n = 10) %>% pull(column)

# Combine
key_vars <- unique(c(keyword_matches, top_by_variance))
key_vars <- key_vars[1:min(12, length(key_vars))]

cat("Key variables selected for visualization:\n")
## Key variables selected for visualization:
cat(paste(key_vars, collapse = "\n"))
## peak_braking_force
## peak_braking_power
## peak_propulsive_force
## peak_propulsive_power
## left_leg_peak_propulsive_force
## right_leg_peak_propulsive_force
## jump_height
## braking_duration
## propulsive_duration
## displacement_depth
## force_at_min_displacement
## body_mass

5.2 Overall Distributions

for (var in key_vars) {
  p <- df %>%
    ggplot(aes(x = .data[[var]])) +
    geom_histogram(aes(y = after_stat(density)), bins = 30,
                   fill = "steelblue", alpha = 0.7) +
    geom_density(color = "darkblue", linewidth = 1) +
    labs(title = paste("Distribution:", var), x = var, y = "Density") +
    theme_minimal()

  print(p)
}

5.3 Distributions by Shot Type

for (var in key_vars) {
  cat("\n\n### ", var, "\n\n")

  p <- df %>%
    ggplot(aes(x = shot_type, y = .data[[var]], fill = shot_type)) +
    geom_violin(alpha = 0.5) +
    geom_boxplot(width = 0.2, alpha = 0.8, outlier.shape = NA) +
    geom_jitter(width = 0.1, alpha = 0.3, size = 1) +
    scale_fill_manual(values = c("2PT" = "#1f77b4", "3PT" = "#ff7f0e")) +
    labs(title = paste(var, "by Shot Type"), x = "Shot Type", y = var) +
    theme_minimal() +
    theme(legend.position = "none")

  print(p)
}

5.3.1 peak_braking_force

5.3.2 peak_braking_power

5.3.3 peak_propulsive_force

5.3.4 peak_propulsive_power

5.3.5 left_leg_peak_propulsive_force

5.3.6 right_leg_peak_propulsive_force

5.3.7 jump_height

5.3.8 braking_duration

5.3.9 propulsive_duration

5.3.10 displacement_depth

5.3.11 force_at_min_displacement

5.3.12 body_mass

6 Player-Level Variability

jump_col <- find_col(numeric_cols, c("jump_height", "jump", "height"))
has_jump <- !is.na(jump_col)

if (has_jump) {
  cat("Jump height variable identified:", jump_col, "\n")
} else {
  cat("No jump height variable found\n")
}
## Jump height variable identified: jump_height

6.1 Paired Dot Plot: 2PT vs 3PT

player_means <- df %>%
  group_by(player_id, shot_type) %>%
  summarise(mean_value = mean(.data[[jump_col]], na.rm = TRUE), .groups = "drop")

player_wide <- player_means %>%
  pivot_wider(names_from = shot_type, values_from = mean_value)

p_paired <- player_wide %>%
  ggplot(aes(x = `2PT`, y = `3PT`)) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "gray50") +
  geom_point(size = 4, alpha = 0.7, color = "steelblue") +
  labs(
    title = paste("Player Mean", jump_col, ": 2PT vs 3PT"),
    subtitle = "Points above dashed line = higher 3PT than 2PT",
    x = "Mean 2PT Jump Height",
    y = "Mean 3PT Jump Height"
  ) +
  theme_minimal() +
  coord_equal()

print(p_paired)

6.2 Spaghetti Plot

p_spaghetti <- player_means %>%
  ggplot(aes(x = shot_type, y = mean_value, group = player_id, color = player_id)) +
  geom_line(alpha = 0.7, linewidth = 1) +
  geom_point(size = 3) +
  labs(
    title = paste("Player-Level Mean", jump_col, "by Shot Type"),
    x = "Shot Type",
    y = paste("Mean", jump_col),
    color = "Player"
  ) +
  theme_minimal()

print(p_spaghetti)

7 Correlation Structure

7.1 Correlation Matrix

cor_matrix <- df %>%
  select(all_of(numeric_cols)) %>%
  cor(use = "complete.obs")

# Convert to long format
cor_long <- cor_matrix %>%
  as.data.frame() %>%
  rownames_to_column("var1") %>%
  pivot_longer(-var1, names_to = "var2", values_to = "correlation")

7.2 Correlation Heatmap

p_cor <- cor_long %>%
  ggplot(aes(x = var1, y = var2, fill = correlation)) +
  geom_tile() +
  scale_fill_gradient2(
    low = "#2166ac", mid = "white", high = "#b2182b",
    midpoint = 0, limits = c(-1, 1)
  ) +
  labs(title = "Correlation Matrix", x = "", y = "") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 7),
    axis.text.y = element_text(size = 7)
  )

print(p_cor)

7.3 Strong Correlations (|r| > 0.7)

strong_cors <- cor_long %>%
  filter(var1 < var2) %>%
  filter(abs(correlation) > 0.7) %>%
  arrange(desc(abs(correlation))) %>%
  mutate(correlation = round(correlation, 3))

kable(strong_cors %>% head(20), caption = "Strongest Correlations (|r| > 0.7)")
Strongest Correlations (|r| > 0.7)
var1 var2 correlation
left_leg_peak_propulsive_force peak_propulsive_force 0.923
force_at_min_displacement peak_braking_force 0.877
peak_propulsive_force right_leg_peak_propulsive_force 0.876
peak_braking_force peak_propulsive_force 0.711

8 Shot Success Analysis

overall_make_rate <- mean(df$made, na.rm = TRUE)
cat("Overall make rate:", round(overall_make_rate * 100, 1), "%\n")
## Overall make rate: 64.1 %

8.1 Make Rate by Shot Type

make_by_type <- df %>%
  group_by(shot_type) %>%
  summarise(
    n = n(),
    makes = sum(made, na.rm = TRUE),
    make_rate = round(mean(made, na.rm = TRUE) * 100, 1),
    .groups = "drop"
  )

kable(make_by_type, caption = "Make Rate by Shot Type")
Make Rate by Shot Type
shot_type n makes make_rate
2PT 230 167 72.6
3PT 230 128 55.7
p_make_type <- make_by_type %>%
  ggplot(aes(x = shot_type, y = make_rate, fill = shot_type)) +
  geom_col(alpha = 0.8) +
  geom_hline(yintercept = overall_make_rate * 100, linetype = "dashed", color = "red") +
  scale_fill_manual(values = c("2PT" = "#1f77b4", "3PT" = "#ff7f0e")) +
  labs(title = "Make Rate by Shot Type",
       subtitle = paste("Red line = overall (", round(overall_make_rate * 100, 1), "%)"),
       x = "Shot Type", y = "Make Rate (%)") +
  theme_minimal() +
  theme(legend.position = "none")

print(p_make_type)

8.2 Make Rate by Player

make_by_player <- df %>%
  group_by(player_id) %>%
  summarise(
    n = n(),
    makes = sum(made, na.rm = TRUE),
    make_rate = round(mean(made, na.rm = TRUE) * 100, 1),
    .groups = "drop"
  ) %>%
  arrange(desc(make_rate))

kable(make_by_player, caption = "Make Rate by Player (Ranked)")
Make Rate by Player (Ranked)
player_id n makes make_rate
16 20 16 80
17 20 16 80
9 20 16 80
13 20 15 75
21 20 15 75
3 20 15 75
2 20 14 70
22 20 14 70
1 20 13 65
15 20 13 65
19 20 13 65
6 20 13 65
11 20 12 60
12 20 12 60
14 20 12 60
20 20 12 60
5 20 12 60
7 20 12 60
18 20 11 55
4 20 11 55
10 20 10 50
23 20 9 45
8 20 9 45
p_make_player <- make_by_player %>%
  ggplot(aes(x = reorder(player_id, make_rate), y = make_rate)) +
  geom_col(fill = "steelblue", alpha = 0.8) +
  geom_hline(yintercept = overall_make_rate * 100, linetype = "dashed", color = "red") +
  coord_flip() +
  labs(title = "Make Rate by Player",
       subtitle = paste("Red line = overall (", round(overall_make_rate * 100, 1), "%)"),
       x = "Player ID", y = "Make Rate (%)") +
  theme_minimal()

print(p_make_player)

8.3 Jump Height vs Make Probability

p_jump_make <- df %>%
  ggplot(aes(x = .data[[jump_col]], y = made)) +
  geom_jitter(alpha = 0.4, height = 0.1, width = 0, color = "steelblue") +
  geom_smooth(method = "loess", se = TRUE, color = "red", fill = "red", alpha = 0.2) +
  labs(
    title = paste("Make Probability vs", jump_col, "(Descriptive Only)"),
    subtitle = "Red line = LOESS smooth",
    x = jump_col,
    y = "Made (0/1)"
  ) +
  theme_minimal()

print(p_jump_make)

9 EDA Takeaways

cat("## Summary\n\n")

9.1 Summary

# Completeness
cat("### Data Completeness\n\n")

9.1.1 Data Completeness

high_missing <- missingness_by_col %>% filter(pct_missing > 5)
if (nrow(high_missing) > 0) {
  cat("- **Columns with >5% missing:**", paste(high_missing$column, collapse = ", "), "\n")
} else {
  cat("- ✓ No columns with >5% missing data\n")
}
  • ✓ No columns with >5% missing data
incomplete <- trials_per_player %>% filter(n_trials != 20)
if (nrow(incomplete) > 0) {
  cat("- **Players with incomplete trials:**", paste(incomplete$player_id, collapse = ", "), "\n")
} else {
  cat("- ✓ All players have expected 20 trials\n")
}
  • ✓ All players have expected 20 trials
# Artifact detection
cat("\n### Potential Artifact Variables\n\n")

9.1.2 Potential Artifact Variables

artifact_flags <- numeric_summary %>%
  mutate(p99_p1_ratio = ifelse(p1 != 0 & !is.na(p1), p99 / p1, NA)) %>%
  filter(p99_p1_ratio > 100 | min < 0 | is.na(p99_p1_ratio))

if (nrow(artifact_flags) > 0) {
  cat("- **Flagged variables:**", paste(artifact_flags$variable, collapse = ", "), "\n")
  cat("  - (Extreme p99/p1 ratios or negative minimums)\n")
} else {
  cat("- ✓ No obvious artifact variables detected\n")
}
  • Flagged variables: displacement_depth, force_at_min_displacement, peak_braking_power
    • (Extreme p99/p1 ratios or negative minimums)
# Stability
cat("\n### Variable Stability\n\n")

9.1.3 Variable Stability

cv_summary <- numeric_summary %>%
  mutate(cv = sd / abs(mean)) %>%
  arrange(cv)

stable_vars <- cv_summary %>% filter(cv < 0.3) %>% pull(variable)
noisy_vars <- cv_summary %>% filter(cv > 0.5) %>% pull(variable)

cat("- **Most stable (CV < 0.3):**", paste(head(stable_vars, 5), collapse = ", "), "\n")
  • Most stable (CV < 0.3): height, body_mass, peak_propulsive_force, right_leg_peak_propulsive_force, left_leg_peak_propulsive_force
cat("- **Most variable (CV > 0.5):**", paste(head(noisy_vars, 5), collapse = ", "), "\n")
  • Most variable (CV > 0.5): jump_height, shot_number, propulsive_duration, displacement_depth, braking_duration
# Collinearity
cat("\n### Collinearity Patterns\n\n")

9.1.4 Collinearity Patterns

if (nrow(strong_cors) > 0) {
  cat("- Found **", nrow(strong_cors), " variable pairs** with |r| > 0.7\n")
  cat("- **Top correlations:**\n")
  top_cors <- strong_cors %>% slice_head(n = 5)
  for (i in 1:nrow(top_cors)) {
    cat("  - ", top_cors$var1[i], " ↔ ", top_cors$var2[i],
        " (r = ", top_cors$correlation[i], ")\n", sep = "")
  }
} else {
  cat("- No variable pairs with |r| > 0.7\n")
}
  • Found ** 4 variable pairs** with |r| > 0.7
  • Top correlations:
    • left_leg_peak_propulsive_force ↔︎ peak_propulsive_force (r = 0.923)
    • force_at_min_displacement ↔︎ peak_braking_force (r = 0.877)
    • peak_propulsive_force ↔︎ right_leg_peak_propulsive_force (r = 0.876)
    • peak_braking_force ↔︎ peak_propulsive_force (r = 0.711)
# Shot type differences
if (has_shot_type) {
  cat("\n### Shot Type Observations\n\n")
  cat("- Review violin/boxplots for systematic 2PT vs 3PT differences\n")
  cat("- Jump height is typically higher for 3PT shots (greater distance)\n")
}

9.1.5 Shot Type Observations

  • Review violin/boxplots for systematic 2PT vs 3PT differences
  • Jump height is typically higher for 3PT shots (greater distance)
# Make rate
if (has_made) {
  cat("\n### Make Rate Observations\n\n")
  cat("- **Overall make rate:**", round(overall_make_rate * 100, 1), "%\n")
  cat("- **Player variation:**", round(min(make_by_player$make_rate), 1), "% –",
      round(max(make_by_player$make_rate), 1), "%\n")
}

9.1.6 Make Rate Observations

  • Overall make rate: 64.1 %
  • Player variation: 45 % – 80 %

10 Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Tahoe 26.2
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] C.UTF-8/C.UTF-8/C.UTF-8/C/C.UTF-8/C.UTF-8
## 
## time zone: America/Los_Angeles
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] gt_1.3.0        skimr_2.2.2     naniar_1.1.0    here_1.0.2     
##  [5] knitr_1.51      janitor_2.2.1   lubridate_1.9.5 forcats_1.0.1  
##  [9] stringr_1.6.0   dplyr_1.2.0     purrr_1.2.1     readr_2.1.6    
## [13] tidyr_1.3.2     tibble_3.3.1    ggplot2_4.0.2   tidyverse_2.0.0
## 
## loaded via a namespace (and not attached):
##  [1] sass_0.4.10        generics_0.1.4     xml2_1.5.2         lattice_0.22-9    
##  [5] stringi_1.8.7      hms_1.1.4          digest_0.6.39      magrittr_2.0.4    
##  [9] evaluate_1.0.5     grid_4.5.1         timechange_0.4.0   RColorBrewer_1.1-3
## [13] fastmap_1.2.0      Matrix_1.7-4       rprojroot_2.1.1    jsonlite_2.0.0    
## [17] mgcv_1.9-4         scales_1.4.0       jquerylib_0.1.4    cli_3.6.5         
## [21] crayon_1.5.3       rlang_1.1.7        splines_4.5.1      bit64_4.6.0-1     
## [25] base64enc_0.1-6    withr_3.0.2        repr_1.1.7         cachem_1.1.0      
## [29] yaml_2.3.12        otel_0.2.0         parallel_4.5.1     tools_4.5.1       
## [33] tzdb_0.5.0         vctrs_0.7.1        R6_2.6.1           lifecycle_1.0.5   
## [37] snakecase_0.11.1   bit_4.6.0          fs_1.6.6           vroom_1.7.0       
## [41] pkgconfig_2.0.3    pillar_1.11.1      bslib_0.10.0       gtable_0.3.6      
## [45] glue_1.8.0         visdat_0.6.0       xfun_0.56          tidyselect_1.2.1  
## [49] farver_2.1.2       nlme_3.1-168       htmltools_0.5.9    labeling_0.4.3    
## [53] rmarkdown_2.30     compiler_4.5.1     S7_0.2.1